Image features, e.g. haralick texture features, potentially contains rich biological information. Here, I attempt to select features that are capable of differentiating living from dead organoids based on (hand chosen) positive and negative controls (DMSO).

Due to the sheer amount of features, a first attempt is made at an analysis with the averaged features for each organoid in wells. Both the trimmed mean and the trimmed standard deviation of the features for each organoid in a well are used. In addition, I averaged the values of large and small objects separately. Proper organoids most likely have an area of at least 2500 pixels, so all objects larger than this were grouped together. All objects smaller than 2500 pixels in area were also grouped and considered to be either individual cells or fragments of organoids. These two sets of summary features were concatenated for each well so that for each well, the feature set looks like:

[organoids_feature1_median, ... , organoids_featureN_median, organoids_feature1_mad, ... , organoids_featureN_mad, 
 fragments_feature1_median, ... , fragments_featureN_median, fragments_feature1_mad, ... , fragments_featureN_mad]

1 Load and Normalize Features

As I’m interested in the libraries with concentration information, I use only plates with layout L08.

all_cell_lines = unique(substr(list.files(feature_dir, pattern = "D0"), 1, 7))

pos_ctrl_treatments = "Irinotecan / SN-38|Bortezomib|Staurosporine_500nM|Volasertib|Methotrexate"

features = lapply(all_cell_lines, function(cell_line) {
  feat = get_median_features(
    cell_line = cell_line, featuredir = feature_dir, 
    configdir = config_dir, feature_type = feature_type)
  lib = substr(rownames(feat), 12, 14)
  feat = feat[lib == "L08",]
  pos_ctrl = feat[grep(pos_ctrl_treatments, feat$DRUG, ignore.case = TRUE),]
  neg_ctrl = feat[grep("DMSO", feat$DRUG, ignore.case = TRUE),]
  dat = rbind(pos_ctrl, neg_ctrl)
  dat$CELL.LINE = cell_line
  return(dat)})

# well info
cell_line_info = rep(all_cell_lines, times = sapply(features, nrow))
well_name_info = do.call(c, lapply(features, rownames))

features = data.frame(rbindlist(features, use.names = TRUE, fill = TRUE))
rownames(features) = well_name_info

metadata_cols = c("REPLICATE", "DRUG", "CONCENTRATION", "CELL.LINE")
features_metadata = features[,metadata_cols]
features[,metadata_cols] = NULL

I want to add one compound feature: the total area of the biomass in the image. That is:

\[ A = N_{Organoids} \cdot \langle A \rangle_{Organoids} + N_{Shrapnel} * \langle A \rangle_{Shrapnel}\]

inv_glog = function(y, c=0.05) {
  return(0.25 * exp(-y) * (4*exp(2*y) - c**2))
}

glog = function(x, c=0.05) {
  return(log((x + sqrt(x**2 + c**2)) / 2))
}

num_organoids = features[,"organoids_num.of.objects"]
num_shrapnel = features[,"shrapnel_num.of.objects"]
avg_area_orgs = features[,"organoids_x.0.s.area_median"]
avg_area_shrapnel = features[,"shrapnel_x.0.s.area_median"]
# logarithmic transformation
features$total.area.of.biomass = glog(
  (inv_glog(num_organoids) * inv_glog(avg_area_orgs)) + 
  (inv_glog(num_shrapnel) * inv_glog(avg_area_shrapnel)))

To avoid plate batch effects, I normalize all features by the mean and standard deviation of the negative controls (DMSO). The features were already trimmed when the averages over each well were calculated so that further trimming is most likely unnecessary. Features that show no variation across DMSO controls on any single plate are removed as they are most likely unreliable.

# Remove features that are constant across DMSO wells for any single plate
dmso_features = features[features_metadata$DRUG == "DMSO",]
plates = substr(rownames(dmso_features), 1, 14)
dmso_sd = aggregate(dmso_features, by = list(plates), sd)
rownames(dmso_sd) = dmso_sd$Group.1
dmso_sd$Group.1 = NULL
features[,names(which(colSums(dmso_sd == 0) > 0))] = NULL

# Normalize remaining features on mean and sd of dmso controls
plates = substr(rownames(features), 1, 14)
for(plate in unique(plates)) {
  plate_feats = features[plate == plates,]
  dmso_feats = features[plate == plates & features_metadata$DRUG == "DMSO",]
  m = apply(dmso_feats, 2, mean)
  m = matrix(
    rep(m, nrow(plate_feats)), 
    nrow = nrow(plate_feats), 
    ncol = ncol(plate_feats), 
    byrow = TRUE)
  s = apply(dmso_feats, 2, sd)
  s = matrix(
    rep(s, nrow(plate_feats)), 
    nrow = nrow(plate_feats), 
    ncol = ncol(plate_feats), 
    byrow = TRUE)
  features[plate == plates,] = (features[plate == plates,] - m) / s
}

2 Positive controls

The drugs Bortezomib, Irinotecan / SN-38, Staurosporine, Volasertib, and Methotrexate are all potential positive controls. To see how suitable they are for this, I look at how they compare to DMSO, the negative control. To compare treatments, I look at the number of organoids as a proxy for organoid survival.

sf_drug = features_metadata[,"DRUG"]
sf_conc = features_metadata[,"CONCENTRATION"]
sf_drug_conc = ifelse(
  test = sf_drug %in% c("DMSO", "Staurosporine_500nM"), 
  yes = sf_drug, no = paste0(sf_drug, " @ ", sf_conc))
sf_num_organoids = features[,"organoids_num.of.objects"]
sf_avg_org_size = features[,"organoids_x.0.s.area_median"]
sf_total_area = features[,"total.area.of.biomass"]

simple_features = data.frame(
  "Num.Objects" = sf_num_organoids,
  "Avg.Org.Size" = sf_avg_org_size,
  "Total.Area" = sf_total_area,
  "Drug" = sf_drug,
  "Concentration" = sf_conc,
  "Drug_Conc" = sf_drug_conc,
  "Cell.Line" = substr(rownames(features), 1, 4)
)

2.1 Comparing Positive Controls

2.1.1 Staurosporine_500nM

Staurosporine and DMSO were only applied at one concentration.

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Staurosporine_500nM"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Staurosporine_500nM)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Staurosporine_500nM)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Staurosporine_500nM)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.2 Bortezomib

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Bortezomib"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Bortezomib)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Bortezomib)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Bortezomib)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.3 Irinotecan / SN-38

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Irinotecan / SN-38"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Irinotecan / SN-38)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Irinotecan / SN-38)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Irinotecan / SN-38)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.4 Volasertib

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Volasertib"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Volasertib)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Volasertib)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Volasertib)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.1.5 Methotrexate

ggplotdf = simple_features[simple_features$Drug %in% c("DMSO", "Methotrexate"),]
ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug_Conc)) + 
  ggtitle(label = "Number of Organoids per Well (Methotrexate)") + xlab("Cell Line") + 
  ylab("Number of Organoids")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug_Conc)) + 
  ggtitle(label = "Average Are of Organoids (Methotrexate)") + 
  xlab("Cell Line") + ylab("Average Area")

ggplot(data = ggplotdf) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug_Conc)) + 
  ggtitle(label = "Total Biomass per Well (Methotrexate)") + 
  xlab("Cell Line") + ylab("Total Biomass Area")

2.2 Combining Positive Controls

With a few exceptions in individual cell lines, the highest concentrations of positive controls (“0.2” and “1”) seem to be well differentiable from the negative controls with regards to the number of organoids in each well. Grouping the positive controls leads to a larger within-group variance but still leaves them clearly differentiable from the negative controls.

valid_entries = features_metadata$DRUG %in% c("DMSO", "Staurosporine_500nM") | 
  features_metadata$CONCENTRATION %in% c("0.2", "1")
features_combined = features[valid_entries,]
features_metadata_combined = features_metadata[valid_entries,]
features_metadata_combined$DRUG = ifelse(
  test = features_metadata_combined$DRUG == "DMSO",
  yes = "NEG_CTRL", no = "POS_CTRL")
simple_features = data.frame(
  "Num.Objects" = features_combined[,"organoids_num.of.objects"],
  "Avg.Org.Size" = features_combined[,"organoids_x.0.s.area_median"],
  "Total.Area" = features_combined[,"total.area.of.biomass"],
  "Drug" = features_metadata_combined[,"DRUG"],
  "Cell.Line" = substr(rownames(features_combined), 1, 4)
)
ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug)) +
  ggtitle(label = "Number of Organoids per Well",
          subtitle = "Grouped positive controls at concentrations of 1 and 0.2") + xlab("Cell Line") +
  ylab("Number of Organoids")

ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Avg.Org.Size, fill = Drug)) +
  ggtitle(label = "Average Area of Organoids per Well",
          subtitle = "Grouped positive controls at concentrations of 1 and 0.2") + xlab("Cell Line") +
  ylab("Average Area of of Organoids")

ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Total.Area, fill = Drug)) +
  ggtitle(label = "Total Area of Biomass per Well",
          subtitle = "Grouped positive controls at concentrations of 1 and 0.2") + xlab("Cell Line") +
  ylab("Total Area of Biomass")

3 Finding Characteristic Features

Next, I am interested in finding features that allow for a similar separation between positive and negative controls. The two conditions I impose are:

Since I’m only interested in a ranking and not a formal statistical test for the separability of the distributions, I simply look at the ratio of within-group to between-group variances.

# Begin by removing those features without a unanimous direction between positive and negative
# control.
permitted_outliers = 1

pos_ctrl = features_combined[features_metadata_combined$DRUG == "POS_CTRL",]
pos_ctrl_metadata = features_metadata_combined[features_metadata_combined$DRUG == "POS_CTRL",]
pos_ctrl_mean = aggregate(pos_ctrl, list(pos_ctrl_metadata$CELL.LINE), mean)
rownames(pos_ctrl_mean) = pos_ctrl_mean$Group.1
pos_ctrl_mean$Group.1 = NULL

neg_ctrl = features_combined[features_metadata_combined$DRUG == "NEG_CTRL",]
neg_ctrl_metadata = features_metadata_combined[features_metadata_combined$DRUG == "NEG_CTRL",]
neg_ctrl_mean = aggregate(neg_ctrl, list(neg_ctrl_metadata$CELL.LINE), mean)
rownames(neg_ctrl_mean) = neg_ctrl_mean$Group.1
neg_ctrl_mean$Group.1 = NULL

pos_neg_dir = (pos_ctrl_mean - neg_ctrl_mean) > 0
invalid_features = names(which(
  (colSums(pos_neg_dir) > permitted_outliers) & 
  (colSums(pos_neg_dir) < (nrow(pos_neg_dir) - permitted_outliers))))
features_combined[,invalid_features] = NULL

# Now rank features by the ratio of their between-group to within-group ratio
pos_ctrl = features_combined[features_metadata_combined$DRUG == "POS_CTRL",]
pos_ctrl_metadata = features_metadata_combined[features_metadata_combined$DRUG == "POS_CTRL",]
pos_ctrl_var = aggregate(pos_ctrl, list(pos_ctrl_metadata$CELL.LINE), var)
rownames(pos_ctrl_var) = pos_ctrl_var$Group.1
pos_ctrl_var$Group.1 = NULL

# Negative controls
neg_ctrl = features_combined[features_metadata_combined$DRUG == "NEG_CTRL",]
neg_ctrl_metadata = features_metadata_combined[features_metadata_combined$DRUG == "NEG_CTRL",]
neg_ctrl_var = aggregate(neg_ctrl, list(neg_ctrl_metadata$CELL.LINE), var)
rownames(neg_ctrl_var) = neg_ctrl_var$Group.1
neg_ctrl_var$Group.1 = NULL

within_group_var = (pos_ctrl_var + neg_ctrl_var) / 2

# Between-group variance
between_group_var = aggregate(features_combined, list(features_metadata_combined$CELL.LINE), var)
rownames(between_group_var) = between_group_var$Group.1
between_group_var$Group.1 = NULL

# Calculate ratio and apply a trimmed mean (trim 2 cell lines off both ends)
var_ratio = between_group_var / within_group_var
var_ratio_trimmed_mean = apply(var_ratio, 2, function(x) {
  s = sort(x)
  s = s[3:(length(s) - 2)]
  mean(s)
})

qplot(
  x = seq_along(var_ratio_trimmed_mean), y = sort(var_ratio_trimmed_mean, decreasing = TRUE), 
  xlab = "Features", ylab = "Variance Ratio", 
  main = "Between-Group Variance / Within-Group Variance")

The number of organoids, as investigated above, are a good indicator of organoid viability, so I will use the variance ratio for this feature as the lower threshold for the feature selection. Any feature with a better separability and a near-unanimous effect direction between controls can be considered a potential indicator of organoid viability.

lower_thresh = var_ratio_trimmed_mean["organoids_num.of.objects"]
possible_features = names(which(var_ratio_trimmed_mean >= lower_thresh))
features_combined = features_combined[,possible_features]

As a final step, I want to make sure that the features weren’t chosen simply because they correlate strongly. Therefore, I apply a stepwise feature selection, as per Fischer et al. 2015 (“A Map of Directional Genetic Interactions in a Metazoan Cell”) and Laufer et al. 2013 (“Mapping genetic interactions in human cancer cells”). Feature selection is performed on the combined feature set for all cell lines. This entails the following steps: 1. Choose an initial feature set. This will be the number of organoids and the total biomass, both of which suitably separated the positive from negative controls. 2. Model each replicate of the remaining features as a function of the already chosen feature set. Since I am interested in extracting features with the highest signal-to-noise ratio, the feature with the highest correlation between replicate residuals is added to the feature set. 3. Repeat step 2 with the new feature set until the distribution of the correlation of all remaining residuals is centered around 0, i.e. none of the remaining features add any information.

# Choose the best three as a starting set
initial_set = c("organoids_num.of.objects", "total.area.of.biomass")
selected_features = initial_set
remaining_features = colnames(features_combined)[
  !colnames(features_combined) %in% selected_features]

# Remember statistics on the correlations
all_correlations = list()
ratio_positive = c()

# Calculate the fit for each replicate and save the residuals
while(length(remaining_features) > 0) {
  residuals = array(0, dim = c(nrow(features_combined), 2, length(remaining_features)))
  dimnames(residuals) = list(NULL, NULL, remaining_features)
  for(feat in remaining_features) {
    model1 = lm(
    features_combined[features_metadata_combined$REPLICATE == 1, feat] ~
      as.matrix(features_combined[features_metadata_combined$REPLICATE == 1, selected_features]))
    model2 = lm(
      features_combined[features_metadata_combined$REPLICATE == 2, feat] ~
        as.matrix(features_combined[features_metadata_combined$REPLICATE == 2, selected_features]))
    residuals[,1,feat] = model1$residuals
    residuals[,2,feat] = model2$residuals
  }

  # Calculate the correlation
  correlations = apply(residuals, 3, function(x) {
    x1 = x[,1]
    x2 = x[,2]
    I = which(is.finite(x1) & is.finite(x2))
    cor(x1[I], x2[I])
  })

  to_select = names(correlations)[which.max(correlations)]
  selected_features = c(selected_features, to_select)
  remaining_features = colnames(features_combined)[
    !colnames(features_combined) %in% selected_features]
  all_correlations = c(all_correlations, list(correlations))
  ratio_positive = c(ratio_positive, sum(correlations > 0, na.rm = TRUE) / length(correlations))
}

# Keep only the selected features with a ratio > 0.5 of positively correlated residual matrices
selected_features = selected_features[c(rep(TRUE, length(initial_set)), ratio_positive > 0.5)]
ratio_color = ifelse(ratio_positive >= 0.5, "1", "2")
ggplot(data = data.frame(
  "Iteration" = seq_along(ratio_positive),
  "Ratio" = ratio_positive-0.5)) +
  geom_bar(aes(x = Iteration, y = Ratio, fill = ratio_color), stat = "identity") +
  ggtitle("Ratio of features with positive correlation after each iteration") +
  theme(legend.position = "none") + scale_y_continuous(labels = function(x) x+0.5)

What remains are (length(selected_features)) features used to describe organoid viability

print(selected_features)
##  [1] "organoids_num.of.objects"        "total.area.of.biomass"          
##  [3] "shrapnel_x.b.h.f12.s1_median"    "shrapnel_x.Bc.h.asm.s4_median"  
##  [5] "shrapnel_x.Bc.b.q05_median"      "organoids_x.c.b.q005_mad"       
##  [7] "organoids_x.Bc.h.asm.s16_median" "shrapnel_x.Bc.b.q05_mad"        
##  [9] "shrapnel_x.Bbc.h.f12.s2_median"  "shrapnel_x.Bb.h.den.s4_median"  
## [11] "organoids_x.0.s.radius.sd_mad"   "shrapnel_x.c.b.q05_median"      
## [13] "organoids_x.Bc.h.idm.s8_median"  "organoids_x.c.b.q05_median"     
## [15] "organoids_x.Bc.h.idm.s4_median"  "organoids_x.Bc.h.idm.s2_median" 
## [17] "shrapnel_x.Bb.h.f12.s2_median"   "shrapnel_x.Bb.h.con.s16_mad"    
## [19] "organoids_x.Bc.b.q05_median"     "organoids_x.Bc.b.q05_mad"       
## [21] "organoids_x.Bc.h.asm.s8_median"  "organoids_x.Bc.h.asm.s4_median" 
## [23] "shrapnel_x.b.h.cor.s1_median"
features_combined = features_combined[,selected_features]

An few examples for these features are shown below

feature_of_interest = colnames(features_combined)[3]
simple_features = data.frame(
  "Num.Objects" = features_combined[, feature_of_interest],
  "Num.Objects.Noise" = features_combined[, feature_of_interest],
  "Drug" = features_metadata_combined[, "DRUG"],
  "Cell.Line" = substr(rownames(features_combined), 1, 4)
)
ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug)) +
  ggtitle(label = sprintf("Feature '%s'", feature_of_interest)) + 
  xlab("Cell Line") + ylab("Number of Organoids")

feature_of_interest = colnames(features_combined)[6]
simple_features = data.frame(
  "Num.Objects" = features_combined[, feature_of_interest],
  "Num.Objects.Noise" = features_combined[, feature_of_interest],
  "Drug" = features_metadata_combined[, "DRUG"],
  "Cell.Line" = substr(rownames(features_combined), 1, 4)
)
ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug)) +
  ggtitle(label = sprintf("Feature '%s'", feature_of_interest)) + 
  xlab("Cell Line") + ylab("Number of Organoids")

feature_of_interest = colnames(features_combined)[20]
simple_features = data.frame(
  "Num.Objects" = features_combined[, feature_of_interest],
  "Num.Objects.Noise" = features_combined[, feature_of_interest],
  "Drug" = features_metadata_combined[, "DRUG"],
  "Cell.Line" = substr(rownames(features_combined), 1, 4)
)
ggplot(data = simple_features) + geom_boxplot(aes(x = Cell.Line, y = Num.Objects, fill = Drug)) +
  ggtitle(label = sprintf("Feature '%s'", feature_of_interest)) + 
  xlab("Cell Line") + ylab("Number of Organoids")